#import required libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from itertools import chain
import os
os.getcwd()
'C:\\Users\\Reena\\Documents\\Capital_One\\Data Challenge'
# enable to scroll up and down
pd.set_option("display.max_rows", None)
#Libraries for plotting
import plotly as py
import plotly.express as px
import plotly.io as pio
import plotly.offline as pyoff
import plotly.graph_objs as go
import seaborn as sns
import matplotlib.pyplot as plt
# loading zillow dataset
file_name = 'Zip_Zhvi_2bedroom.csv'
zillow_df = pd.read_csv(file_name)
#zillow_df.head() # checking the first five rows of dataset
# Filter the dataframe to include data related to New York city
zillow_ny_df = zillow_df[zillow_df['City']=='New York']
zillow_ny_df.head(5) # # checking the first five rows of dataset to confirm if we have data about New York City only
| RegionID | RegionName | City | State | Metro | CountyName | SizeRank | 1996-04 | 1996-05 | 1996-06 | ... | 2016-09 | 2016-10 | 2016-11 | 2016-12 | 2017-01 | 2017-02 | 2017-03 | 2017-04 | 2017-05 | 2017-06 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 61627 | 10013 | New York | NY | New York | New York | 1744 | NaN | NaN | NaN | ... | 3347100 | 3305500 | 3261100 | 3244000.0 | 3231400 | 3183300 | 3160200 | 3193500 | 3262200 | 3316500 |
| 1 | 61628 | 10014 | New York | NY | New York | New York | 379 | NaN | NaN | NaN | ... | 2480800 | 2443200 | 2430100 | 2452900.0 | 2451200 | 2441900 | 2460900 | 2494900 | 2498400 | 2491600 |
| 2 | 61625 | 10011 | New York | NY | New York | New York | 15 | NaN | NaN | NaN | ... | 2354000 | 2355500 | 2352200 | 2332100.0 | 2313300 | 2319600 | 2342100 | 2365900 | 2419700 | 2480400 |
| 6 | 61617 | 10003 | New York | NY | New York | New York | 21 | NaN | NaN | NaN | ... | 1932800 | 1930400 | 1937500 | 1935100.0 | 1915700 | 1916500 | 1965700 | 2045300 | 2109100 | 2147000 |
| 7 | 61637 | 10023 | New York | NY | New York | New York | 3 | NaN | NaN | NaN | ... | 1993500 | 1980700 | 1960900 | 1951300.0 | 1937800 | 1929800 | 1955000 | 2022400 | 2095000 | 2142300 |
5 rows × 262 columns
# To check the number of columns and rows of our zillow dataframe after filtering
zillow_ny_df.shape
(25, 262)
We have 25 rows i.e 25 unique zip codes and 262 columns that includes monthly median price from the year April 1996 till June 2017
# Changed the datatype of RegionName from integer to string datatype
data_types_dict = {'RegionName': str}
zillow_ny_df = zillow_ny_df.astype(data_types_dict)
# To check the datatype of each columns of our dataframe
zillow_ny_df.dtypes
RegionID int64 RegionName object City object State object Metro object CountyName object SizeRank int64 1996-04 float64 1996-05 float64 1996-06 float64 1996-07 float64 1996-08 float64 1996-09 float64 1996-10 float64 1996-11 float64 1996-12 float64 1997-01 float64 1997-02 float64 1997-03 float64 1997-04 float64 1997-05 float64 1997-06 float64 1997-07 float64 1997-08 float64 1997-09 float64 1997-10 float64 1997-11 float64 1997-12 float64 1998-01 float64 1998-02 float64 1998-03 float64 1998-04 float64 1998-05 float64 1998-06 float64 1998-07 float64 1998-08 float64 1998-09 float64 1998-10 float64 1998-11 float64 1998-12 float64 1999-01 float64 1999-02 float64 1999-03 float64 1999-04 float64 1999-05 float64 1999-06 float64 1999-07 float64 1999-08 float64 1999-09 float64 1999-10 float64 1999-11 float64 1999-12 float64 2000-01 float64 2000-02 float64 2000-03 float64 2000-04 float64 2000-05 float64 2000-06 float64 2000-07 float64 2000-08 float64 2000-09 float64 2000-10 float64 2000-11 float64 2000-12 float64 2001-01 float64 2001-02 float64 2001-03 float64 2001-04 float64 2001-05 float64 2001-06 float64 2001-07 float64 2001-08 float64 2001-09 float64 2001-10 float64 2001-11 float64 2001-12 float64 2002-01 float64 2002-02 float64 2002-03 float64 2002-04 float64 2002-05 float64 2002-06 float64 2002-07 float64 2002-08 float64 2002-09 float64 2002-10 float64 2002-11 float64 2002-12 float64 2003-01 float64 2003-02 float64 2003-03 float64 2003-04 float64 2003-05 float64 2003-06 float64 2003-07 float64 2003-08 float64 2003-09 float64 2003-10 float64 2003-11 float64 2003-12 float64 2004-01 float64 2004-02 float64 2004-03 float64 2004-04 float64 2004-05 float64 2004-06 float64 2004-07 float64 2004-08 float64 2004-09 float64 2004-10 float64 2004-11 float64 2004-12 float64 2005-01 float64 2005-02 float64 2005-03 float64 2005-04 float64 2005-05 float64 2005-06 float64 2005-07 float64 2005-08 float64 2005-09 float64 2005-10 float64 2005-11 float64 2005-12 float64 2006-01 float64 2006-02 float64 2006-03 float64 2006-04 float64 2006-05 float64 2006-06 float64 2006-07 float64 2006-08 float64 2006-09 float64 2006-10 float64 2006-11 float64 2006-12 float64 2007-01 float64 2007-02 float64 2007-03 float64 2007-04 float64 2007-05 float64 2007-06 float64 2007-07 float64 2007-08 float64 2007-09 float64 2007-10 float64 2007-11 float64 2007-12 float64 2008-01 float64 2008-02 float64 2008-03 float64 2008-04 float64 2008-05 float64 2008-06 float64 2008-07 float64 2008-08 float64 2008-09 float64 2008-10 float64 2008-11 float64 2008-12 float64 2009-01 float64 2009-02 float64 2009-03 float64 2009-04 float64 2009-05 float64 2009-06 float64 2009-07 float64 2009-08 float64 2009-09 float64 2009-10 float64 2009-11 float64 2009-12 float64 2010-01 float64 2010-02 float64 2010-03 float64 2010-04 float64 2010-05 float64 2010-06 float64 2010-07 float64 2010-08 float64 2010-09 float64 2010-10 float64 2010-11 float64 2010-12 float64 2011-01 float64 2011-02 float64 2011-03 float64 2011-04 float64 2011-05 float64 2011-06 float64 2011-07 float64 2011-08 float64 2011-09 float64 2011-10 float64 2011-11 float64 2011-12 float64 2012-01 float64 2012-02 float64 2012-03 float64 2012-04 float64 2012-05 float64 2012-06 float64 2012-07 float64 2012-08 float64 2012-09 float64 2012-10 float64 2012-11 float64 2012-12 float64 2013-01 float64 2013-02 float64 2013-03 float64 2013-04 float64 2013-05 float64 2013-06 float64 2013-07 float64 2013-08 int64 2013-09 int64 2013-10 int64 2013-11 int64 2013-12 int64 2014-01 int64 2014-02 int64 2014-03 int64 2014-04 int64 2014-05 int64 2014-06 int64 2014-07 int64 2014-08 int64 2014-09 int64 2014-10 int64 2014-11 int64 2014-12 int64 2015-01 int64 2015-02 int64 2015-03 int64 2015-04 int64 2015-05 int64 2015-06 int64 2015-07 int64 2015-08 int64 2015-09 int64 2015-10 int64 2015-11 int64 2015-12 int64 2016-01 int64 2016-02 float64 2016-03 float64 2016-04 float64 2016-05 float64 2016-06 int64 2016-07 int64 2016-08 int64 2016-09 int64 2016-10 int64 2016-11 int64 2016-12 float64 2017-01 int64 2017-02 int64 2017-03 int64 2017-04 int64 2017-05 int64 2017-06 int64 dtype: object
# Defining a fucntion to determine the percent of missing values in each column of the dataset
def missing_column_values(df):
mis_val = df.isnull().sum()
mis_val_percent = 100 * df.isnull().sum() / len(df)
missing_value_df = pd.DataFrame({'Column_name': df.columns,
'Missing_Value': mis_val,
'Missing_Value_Percent': mis_val_percent}).reset_index()
return(missing_value_df)
# calling missing_column_values function to compute the percent of missing values in each column of the dataset
Zillow_df_with_missingvalues = missing_column_values(zillow_ny_df)
Zillow_df_with_missingvalues[Zillow_df_with_missingvalues['Missing_Value_Percent'] > 0]
| index | Column_name | Missing_Value | Missing_Value_Percent | |
|---|---|---|---|---|
| 7 | 1996-04 | 1996-04 | 17 | 68.0 |
| 8 | 1996-05 | 1996-05 | 17 | 68.0 |
| 9 | 1996-06 | 1996-06 | 17 | 68.0 |
| 10 | 1996-07 | 1996-07 | 17 | 68.0 |
| 11 | 1996-08 | 1996-08 | 17 | 68.0 |
| 12 | 1996-09 | 1996-09 | 17 | 68.0 |
| 13 | 1996-10 | 1996-10 | 17 | 68.0 |
| 14 | 1996-11 | 1996-11 | 17 | 68.0 |
| 15 | 1996-12 | 1996-12 | 17 | 68.0 |
| 16 | 1997-01 | 1997-01 | 17 | 68.0 |
| 17 | 1997-02 | 1997-02 | 17 | 68.0 |
| 18 | 1997-03 | 1997-03 | 17 | 68.0 |
| 19 | 1997-04 | 1997-04 | 17 | 68.0 |
| 20 | 1997-05 | 1997-05 | 17 | 68.0 |
| 21 | 1997-06 | 1997-06 | 17 | 68.0 |
| 22 | 1997-07 | 1997-07 | 17 | 68.0 |
| 23 | 1997-08 | 1997-08 | 17 | 68.0 |
| 24 | 1997-09 | 1997-09 | 17 | 68.0 |
| 25 | 1997-10 | 1997-10 | 17 | 68.0 |
| 26 | 1997-11 | 1997-11 | 17 | 68.0 |
| 27 | 1997-12 | 1997-12 | 17 | 68.0 |
| 28 | 1998-01 | 1998-01 | 17 | 68.0 |
| 29 | 1998-02 | 1998-02 | 17 | 68.0 |
| 30 | 1998-03 | 1998-03 | 17 | 68.0 |
| 31 | 1998-04 | 1998-04 | 17 | 68.0 |
| 32 | 1998-05 | 1998-05 | 17 | 68.0 |
| 33 | 1998-06 | 1998-06 | 17 | 68.0 |
| 34 | 1998-07 | 1998-07 | 17 | 68.0 |
| 35 | 1998-08 | 1998-08 | 17 | 68.0 |
| 36 | 1998-09 | 1998-09 | 17 | 68.0 |
| 37 | 1998-10 | 1998-10 | 16 | 64.0 |
| 38 | 1998-11 | 1998-11 | 16 | 64.0 |
| 39 | 1998-12 | 1998-12 | 16 | 64.0 |
| 40 | 1999-01 | 1999-01 | 16 | 64.0 |
| 41 | 1999-02 | 1999-02 | 16 | 64.0 |
| 42 | 1999-03 | 1999-03 | 16 | 64.0 |
| 43 | 1999-04 | 1999-04 | 16 | 64.0 |
| 44 | 1999-05 | 1999-05 | 16 | 64.0 |
| 45 | 1999-06 | 1999-06 | 16 | 64.0 |
| 46 | 1999-07 | 1999-07 | 16 | 64.0 |
| 47 | 1999-08 | 1999-08 | 16 | 64.0 |
| 48 | 1999-09 | 1999-09 | 16 | 64.0 |
| 49 | 1999-10 | 1999-10 | 16 | 64.0 |
| 50 | 1999-11 | 1999-11 | 16 | 64.0 |
| 51 | 1999-12 | 1999-12 | 16 | 64.0 |
| 52 | 2000-01 | 2000-01 | 16 | 64.0 |
| 53 | 2000-02 | 2000-02 | 16 | 64.0 |
| 54 | 2000-03 | 2000-03 | 16 | 64.0 |
| 55 | 2000-04 | 2000-04 | 16 | 64.0 |
| 56 | 2000-05 | 2000-05 | 16 | 64.0 |
| 57 | 2000-06 | 2000-06 | 16 | 64.0 |
| 58 | 2000-07 | 2000-07 | 16 | 64.0 |
| 59 | 2000-08 | 2000-08 | 16 | 64.0 |
| 60 | 2000-09 | 2000-09 | 16 | 64.0 |
| 61 | 2000-10 | 2000-10 | 16 | 64.0 |
| 62 | 2000-11 | 2000-11 | 16 | 64.0 |
| 63 | 2000-12 | 2000-12 | 16 | 64.0 |
| 64 | 2001-01 | 2001-01 | 16 | 64.0 |
| 65 | 2001-02 | 2001-02 | 16 | 64.0 |
| 66 | 2001-03 | 2001-03 | 16 | 64.0 |
| 67 | 2001-04 | 2001-04 | 16 | 64.0 |
| 68 | 2001-05 | 2001-05 | 16 | 64.0 |
| 69 | 2001-06 | 2001-06 | 16 | 64.0 |
| 70 | 2001-07 | 2001-07 | 16 | 64.0 |
| 71 | 2001-08 | 2001-08 | 16 | 64.0 |
| 72 | 2001-09 | 2001-09 | 16 | 64.0 |
| 73 | 2001-10 | 2001-10 | 16 | 64.0 |
| 74 | 2001-11 | 2001-11 | 16 | 64.0 |
| 75 | 2001-12 | 2001-12 | 16 | 64.0 |
| 76 | 2002-01 | 2002-01 | 16 | 64.0 |
| 77 | 2002-02 | 2002-02 | 17 | 68.0 |
| 78 | 2002-03 | 2002-03 | 17 | 68.0 |
| 79 | 2002-04 | 2002-04 | 17 | 68.0 |
| 80 | 2002-05 | 2002-05 | 17 | 68.0 |
| 81 | 2002-06 | 2002-06 | 17 | 68.0 |
| 82 | 2002-07 | 2002-07 | 17 | 68.0 |
| 83 | 2002-08 | 2002-08 | 17 | 68.0 |
| 84 | 2002-09 | 2002-09 | 17 | 68.0 |
| 85 | 2002-10 | 2002-10 | 17 | 68.0 |
| 86 | 2002-11 | 2002-11 | 17 | 68.0 |
| 87 | 2002-12 | 2002-12 | 17 | 68.0 |
| 88 | 2003-01 | 2003-01 | 17 | 68.0 |
| 89 | 2003-02 | 2003-02 | 17 | 68.0 |
| 90 | 2003-03 | 2003-03 | 17 | 68.0 |
| 91 | 2003-04 | 2003-04 | 17 | 68.0 |
| 92 | 2003-05 | 2003-05 | 17 | 68.0 |
| 93 | 2003-06 | 2003-06 | 17 | 68.0 |
| 94 | 2003-07 | 2003-07 | 17 | 68.0 |
| 95 | 2003-08 | 2003-08 | 17 | 68.0 |
| 96 | 2003-09 | 2003-09 | 17 | 68.0 |
| 97 | 2003-10 | 2003-10 | 17 | 68.0 |
| 98 | 2003-11 | 2003-11 | 16 | 64.0 |
| 99 | 2003-12 | 2003-12 | 14 | 56.0 |
| 100 | 2004-01 | 2004-01 | 13 | 52.0 |
| 101 | 2004-02 | 2004-02 | 13 | 52.0 |
| 102 | 2004-03 | 2004-03 | 12 | 48.0 |
| 103 | 2004-04 | 2004-04 | 12 | 48.0 |
| 104 | 2004-05 | 2004-05 | 12 | 48.0 |
| 105 | 2004-06 | 2004-06 | 10 | 40.0 |
| 106 | 2004-07 | 2004-07 | 10 | 40.0 |
| 107 | 2004-08 | 2004-08 | 10 | 40.0 |
| 108 | 2004-09 | 2004-09 | 5 | 20.0 |
| 109 | 2004-10 | 2004-10 | 4 | 16.0 |
| 110 | 2004-11 | 2004-11 | 4 | 16.0 |
| 111 | 2004-12 | 2004-12 | 4 | 16.0 |
| 112 | 2005-01 | 2005-01 | 4 | 16.0 |
| 113 | 2005-02 | 2005-02 | 4 | 16.0 |
| 114 | 2005-03 | 2005-03 | 4 | 16.0 |
| 115 | 2005-04 | 2005-04 | 3 | 12.0 |
| 116 | 2005-05 | 2005-05 | 3 | 12.0 |
| 117 | 2005-06 | 2005-06 | 3 | 12.0 |
| 118 | 2005-07 | 2005-07 | 3 | 12.0 |
| 119 | 2005-08 | 2005-08 | 3 | 12.0 |
| 120 | 2005-09 | 2005-09 | 3 | 12.0 |
| 121 | 2005-10 | 2005-10 | 1 | 4.0 |
| 122 | 2005-11 | 2005-11 | 1 | 4.0 |
| 123 | 2005-12 | 2005-12 | 1 | 4.0 |
| 124 | 2006-01 | 2006-01 | 1 | 4.0 |
| 125 | 2006-02 | 2006-02 | 1 | 4.0 |
| 126 | 2006-03 | 2006-03 | 1 | 4.0 |
| 127 | 2006-04 | 2006-04 | 1 | 4.0 |
| 128 | 2006-05 | 2006-05 | 1 | 4.0 |
| 129 | 2006-06 | 2006-06 | 1 | 4.0 |
| 130 | 2006-07 | 2006-07 | 1 | 4.0 |
| 131 | 2006-08 | 2006-08 | 1 | 4.0 |
| 132 | 2006-09 | 2006-09 | 1 | 4.0 |
| 133 | 2006-10 | 2006-10 | 1 | 4.0 |
| 134 | 2006-11 | 2006-11 | 1 | 4.0 |
| 135 | 2006-12 | 2006-12 | 1 | 4.0 |
| 136 | 2007-01 | 2007-01 | 1 | 4.0 |
| 137 | 2007-02 | 2007-02 | 1 | 4.0 |
| 138 | 2007-03 | 2007-03 | 1 | 4.0 |
| 139 | 2007-04 | 2007-04 | 1 | 4.0 |
| 140 | 2007-05 | 2007-05 | 1 | 4.0 |
The above table shows the percent of missing values in each column of the dataset. I am eliminating those columns where Missing_Value_Percent is above 0% from further analysis and retaining the remaining columns
# dropping all the columns with null value(s)
zillow_ny_df.dropna(axis=1, inplace = True)
# Calling missing_column_values function again to check if there are any columns with misssing values.
# Printing any columns that have missing values.
Zillow_df_with_missingvalues = missing_column_values(zillow_ny_df)
Zillow_df_with_missingvalues[Zillow_df_with_missingvalues['Missing_Value_Percent'] > 0].count()
index 0 Column_name 0 Missing_Value 0 Missing_Value_Percent 0 dtype: int64
The above result shows that we have successfully eliminated all the columns with null values
# creating a new dataframe containing only the median price for each zip code in NYC from 2007-06:2017-06
medianprice_NY_df = zillow_ny_df.drop(['RegionID', 'RegionName', 'City', 'State', 'Metro', 'CountyName','SizeRank'], axis=1)
# Computing the median price for properties in each zip code of NYC for each year from June 2007 till June 2017
medianprice_NY_df= medianprice_NY_df.groupby(np.arange(len(medianprice_NY_df.columns))//12, axis=1).median()
cols = pd.date_range('2007', '2018', freq='Y')
cols = cols.strftime('%Y')
# assignming 'Year' as the column name to the yearly median price per zipcode in NYC dataframe
medianprice_NY_df.columns = cols
medianprice_NY_df.head()
| 2007 | 2008 | 2009 | 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2269050.0 | 2358750.0 | 2250650.0 | 2284650.0 | 2431200.0 | 2681800.0 | 2914650.0 | 3073800.0 | 3184300.0 | 3261650.0 | 3316500.0 |
| 1 | 1694500.0 | 1767150.0 | 1504700.0 | 1612000.0 | 1660500.0 | 1769800.0 | 2168650.0 | 2082000.0 | 2328250.0 | 2452050.0 | 2491600.0 |
| 2 | 1622800.0 | 1698900.0 | 1431050.0 | 1442950.0 | 1492650.0 | 1647550.0 | 1907150.0 | 2027400.0 | 2148200.0 | 2337100.0 | 2480400.0 |
| 6 | 1486700.0 | 1582450.0 | 1285200.0 | 1368700.0 | 1352050.0 | 1422450.0 | 1685900.0 | 1721150.0 | 1912600.0 | 1936300.0 | 2147000.0 |
| 7 | 1531350.0 | 1625300.0 | 1323000.0 | 1383000.0 | 1379700.0 | 1380450.0 | 1610150.0 | 1724750.0 | 1838000.0 | 1957950.0 | 2142300.0 |
The above two chunks of code were executed to compute yearly median price for every zip code in New York City. Our initial zillow dataset consisted of monthly median price which would have been cumbersome in performing further analysis on the dataset. So, I computed yearly median price for every zip code in NYC for the past 10 years. I selected median over mean to arrive at the average cost price in each zip code of NYC for the following two reasons:
# Concatenating the yearly median prices for each year from 2007-2017 with their respective zip codes
medianprice_NY_df= pd.concat([zillow_ny_df['RegionName'], medianprice_NY_df], axis=1)
medianprice_NY_df.head()
| RegionName | 2007 | 2008 | 2009 | 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10013 | 2269050.0 | 2358750.0 | 2250650.0 | 2284650.0 | 2431200.0 | 2681800.0 | 2914650.0 | 3073800.0 | 3184300.0 | 3261650.0 | 3316500.0 |
| 1 | 10014 | 1694500.0 | 1767150.0 | 1504700.0 | 1612000.0 | 1660500.0 | 1769800.0 | 2168650.0 | 2082000.0 | 2328250.0 | 2452050.0 | 2491600.0 |
| 2 | 10011 | 1622800.0 | 1698900.0 | 1431050.0 | 1442950.0 | 1492650.0 | 1647550.0 | 1907150.0 | 2027400.0 | 2148200.0 | 2337100.0 | 2480400.0 |
| 6 | 10003 | 1486700.0 | 1582450.0 | 1285200.0 | 1368700.0 | 1352050.0 | 1422450.0 | 1685900.0 | 1721150.0 | 1912600.0 | 1936300.0 | 2147000.0 |
| 7 | 10023 | 1531350.0 | 1625300.0 | 1323000.0 | 1383000.0 | 1379700.0 | 1380450.0 | 1610150.0 | 1724750.0 | 1838000.0 | 1957950.0 | 2142300.0 |
medianprice_NY_df.columns
Index(['RegionName', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
'2014', '2015', '2016', '2017'],
dtype='object')
# Transforming the dataframe: medianprice_NY_df.
NY_medianprice_df= medianprice_NY_df.melt(id_vars=["RegionName"],var_name='Year', value_name='MedianValue')
NY_medianprice_df.head()
| RegionName | Year | MedianValue | |
|---|---|---|---|
| 0 | 10013 | 2007 | 2269050.0 |
| 1 | 10014 | 2007 | 1694500.0 |
| 2 | 10011 | 2007 | 1622800.0 |
| 3 | 10003 | 2007 | 1486700.0 |
| 4 | 10023 | 2007 | 1531350.0 |
Instead of having a separate column for each year in the dataset, I am transposing the dataset to include the median house price per year as a row in the dataset for each zipcode.
NY_medianprice_df_with_varience= NY_medianprice_df.groupby(NY_medianprice_df['RegionName']).agg(["var"]).reset_index()
NY_medianprice_df_with_varience.columns = [['RegionName', 'MedianValue_varience']]
NY_medianprice_df_with_varience = pd.DataFrame(NY_medianprice_df_with_varience)
NY_medianprice_df_with_varience
# #NY_medianprice_df_with_varience['MedianValue_varience'].round(decimals=3)
NY_medianprice_df_with_varience.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 25 entries, 0 to 24 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 (RegionName,) 25 non-null object 1 (MedianValue_varience,) 25 non-null float64 dtypes: float64(1), object(1) memory usage: 528.0+ bytes
# Creatng a trendline of the yearly median house price per zip code in NYC.
fig = px.line(NY_medianprice_df, x='Year', y='MedianValue', color = 'RegionName',
title= 'Median Price for the Past 10 Years')
fig.show()
The above trend lines of median price of each region name shows:
# calculating the yearly percentage change in the median price in each zip code of NYC
NY_medianprice_df_with_pct = pd.DataFrame(NY_medianprice_df)
NY_medianprice_df_with_pct['pct_ch'] = pd.DataFrame((NY_medianprice_df_with_pct.groupby('RegionName')['MedianValue']
.apply(pd.Series.pct_change) + 1))
#NY_medianprice_df_with_pct
# Creating a trendline of the percent change in median house price per zip code in NYC.
fig = px.line(NY_medianprice_df_with_pct, x='Year', y='pct_ch', color = 'RegionName',
title = 'Rate of Change in the Median Price for the Past 10 Years ')
fig.show()
From the above line chart, it is difficult to discern the volatility of the median price. Hence, I will next attempt to calculate the standard deviation in the yearly median house price per zip code.
plt.figure(figsize=(30,10))
sns.set(font_scale = 2)
NY_CostPrice_Varience = sns.barplot(x="RegionName", y="MedianValue", data=NY_medianprice_df, ci="sd",palette="Blues_d",edgecolor=".2",linewidth=2.5)
NY_CostPrice_Varience.axes.set_title("Varience of Cost Price of Each Region in New York",fontsize=30)
NY_CostPrice_Varience.set_xlabel("Zip Code",fontsize=30)
NY_CostPrice_Varience.set_ylabel("Standard Deviation",fontsize=20)
sns.set(font_scale = 2)
The above bar graph certainly clarifies as to which regions are highly volatile and which one's are least volatile. Based on the standard deviation of the last 10 years median house price of the properties in every zipcode of NYC, we can determine the following:
# Computing three years average price change to be used to predict price for the year 2018, 2019,2020
output_df_with_pct = NY_medianprice_df_with_pct.loc[NY_medianprice_df['Year'].isin(['2017','2016', '2015'])]
output_df_with_3year_average_pct_chg = pd.DataFrame(output_df_with_pct.groupby('RegionName')['pct_ch'].apply(pd.Series.mean)).reset_index()
output_df_with_3year_average_pct_chg.columns = ['RegionName', '3Years_Average']
output_df_with_3year_average_pct_chg.head()
| RegionName | 3Years_Average | |
|---|---|---|
| 0 | 10003 | 1.077480 |
| 1 | 10011 | 1.069611 |
| 2 | 10013 | 1.025686 |
| 3 | 10014 | 1.062526 |
| 4 | 10021 | 1.074301 |
In order to predict the median price for the year 2018, 2019 & 2020, we calculated the average percentage change in the median price for the past three years to maintain the recency of how median house prices have changed in each zip code. Prediction of the median price for the year 2020 is needed to check the current median house price in MYC and evaluate the following:
output_2017_df = NY_medianprice_df.loc[NY_medianprice_df['Year'].isin(['2017'])].reset_index()
#output_2017_df_new = pd.concat([output_2017_df[['RegionName', 'Year', 'MedianValue']], output_df_with_3year_average_pct_chg['3Years_Average']], axis=1)
#output_2017_df_new
# defining a function that will predict the median house price for each zip given the following:
# The median house price of the last year per zip code
# The percent change in the price for that year
# The year for which we are making the prediction
def predict_median_price_for_year(year, input_median_house_price_per_year_df, input_3year_average_pct_chg_df):
# Create a new dataframe
pre_dict_median_price_for_year_df = pd.DataFrame(input_median_house_price_per_year_df['RegionName'])
# Create the year column for teh new year
pre_dict_median_price_for_year_df['Year'] = year
# Compute the mediam house price for that year
medianValue = input_median_house_price_per_year_df['MedianValue']
three_year_average_pct_chg = input_3year_average_pct_chg_df['3Years_Average']
predicted_median_price = round(medianValue * three_year_average_pct_chg, 2)
# populate the computed median price in the output dataframe
pre_dict_median_price_for_year_df['MedianValue'] = predicted_median_price
return(pre_dict_median_price_for_year_df)
# Compute the median house prices for the year 2018 by utilising the 2017 values
output_2018_df = predict_median_price_for_year('2018',output_2017_df, output_df_with_3year_average_pct_chg)
# Compute the median house prices for the year 2019 by utilising the 2018 values
output_2019_df = predict_median_price_for_year('2019',output_2018_df, output_df_with_3year_average_pct_chg)
# Compute the median house prices for the year 2020 by utilising the 2019 values
output_2020_df = predict_median_price_for_year('2020',output_2019_df, output_df_with_3year_average_pct_chg)
output_2020_df.head()
| RegionName | Year | MedianValue | |
|---|---|---|---|
| 0 | 10013 | 2020 | 4148662.27 |
| 1 | 10014 | 2020 | 3048989.59 |
| 2 | 10011 | 2020 | 2676482.75 |
| 3 | 10003 | 2020 | 2575435.98 |
| 4 | 10023 | 2020 | 2656181.63 |
# plotting the 2020 median house prices (current price) by zip code.
fig = px.bar(output_2020_df, x="RegionName", y="MedianValue", title="2020 median house price by zip code")
fig.show()
The above bar graph is clearly showing that regions like 10013, 10014, 10011 are the costliest based on median price for the year 2020. Median price of 2020 is considered to be the current cost price for the further analysis
# uploading all the listing files in the folder
import glob,os
# to get the path of the folders where all the listing files have been saved
listings_path = os.getcwd() + '\\NewYork_Airbnb\\'
listings_path
# assigning all the listing csv files to the list variable: filelist
filelist = glob.glob(listings_path + "/*.csv")
filelist
appended_listings_from_all_files = [] # instatitae the empty list
# looping through the list and appending to "appended_listings_from_all_files" list.
# While reading the files, I am excluding the headers since it was causing problems in creating the merged dataset
# I will re-add the column headers later.
for filename in filelist:
df = pd.read_csv(filename, index_col=None, header=None)
appended_listings_from_all_files.append(df)
# concating all the csv files into one dataframe: Ny_data_df
Ny_data_df = pd.concat(appended_listings_from_all_files, axis=0, ignore_index=True)
Ny_data_df
Ny_data_df.head()
Ny_data_df.shape # successfully uploaded all the listing files
(40753, 95)
# assigning the column names to the NY Airbnb dataset
Ny_data_df.columns = ['id','listing_url','scrape_id','last_scraped','name', 'summary',
'space', 'description', 'experiences_offered','neighborhood_overview',
'notes','transit', 'access','interaction','house_rules','thumbnail_url',
'medium_url','picture_url','xl_picture_url','host_id','host_url','host_name',
'host_since','host_location','host_about','host_response_time','host_response_rate',
'host_acceptance_rate','host_is_superhost','host_thumbnail_url','host_picture_url',
'host_neighbourhood','host_listings_count','host_total_listings_count','host_verifications',
'host_has_profile_pic','host_identity_verified','street','neighbourhood','neighbourhood_cleansed',
'neighbourhood_group_cleansed','city','state','zipcode','market','smart_location','country_code',
'country','latitude','longitude','is_location_exact','property_type','room_type','accommodates',
'bathrooms','bedrooms','beds','bed_type','amenities','square_feet','price','weekly_price',
'monthly_price','security_deposit','cleaning_fee','guests_included','extra_people','minimum_nights',
'maximum_nights','calendar_updated','has_availability','availability_30','availability_60',
'availability_90','availability_365','calendar_last_scraped','number_of_reviews','first_review',
'last_review','review_scores_rating','review_scores_accuracy','review_scores_cleanliness',
'review_scores_checkin','review_scores_communication','review_scores_location',
'review_scores_value','license','jurisdiction_names','jurisdiction_names','instant_bookable','cancellation_policy',
'require_guest_profile_picture','require_guest_phone_verification',
'calculated_host_listings_count','reviews_per_month']
We have selected- 'id','city','state', 'zipcode','neighbourhood_group_cleansed','latitude', 'longitude', 'is_location_exact','price', 'property_type', 'bedrooms', 'availability_365','number_of_reviews' so that analysis can be done for New York city to find out popular neighbourhood, property type, price etc.
# selecting columns that will be used for the analysis
NY_listings_df = Ny_data_df[['id','city','state', 'zipcode','neighbourhood_group_cleansed','latitude', 'longitude', 'is_location_exact','price', 'property_type', 'bedrooms', 'availability_365','number_of_reviews']]
NY_listings_df.shape
(40753, 13)
The variables that I have considered for the analysis purpose are :'id','city','state', 'zipcode','neighbourhood_group_cleansed','latitude', 'longitude', 'is_location_exact','price', 'property_type', 'bedrooms', 'availability_365','number_of_reviews'. After retaining the above columns, we have 13 variables or columns and 40753 rows in the dataset
# to check the datatype of each column in my dataframe
NY_listings_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 40753 entries, 0 to 40752 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 40753 non-null int64 1 city 40709 non-null object 2 state 40753 non-null object 3 zipcode 40142 non-null object 4 neighbourhood_group_cleansed 40753 non-null object 5 latitude 40753 non-null float64 6 longitude 40753 non-null float64 7 is_location_exact 40753 non-null object 8 price 40753 non-null object 9 property_type 40753 non-null object 10 bedrooms 40684 non-null float64 11 availability_365 40753 non-null int64 12 number_of_reviews 40753 non-null int64 dtypes: float64(3), int64(3), object(7) memory usage: 4.0+ MB
# filter the dataframe for the New York city with two bedrooms since it is already concluded that
#two bedroom properties are the most profitable
NY_listings_df = NY_listings_df[NY_listings_df['bedrooms']==2.0]
NY_listings_df.head(10) # checking the first five rows of dataset to confirm if we have data about New York City with 2 bedrooms only
NY_listings_df.tail(10) # checking the last five rows of dataset to confirm if we have data about New York City with 2 bedrooms only
| id | city | state | zipcode | neighbourhood_group_cleansed | latitude | longitude | is_location_exact | price | property_type | bedrooms | availability_365 | number_of_reviews | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 40702 | 1444826 | Brooklyn | NY | 11206 | Brooklyn | 40.702894 | -73.937281 | t | $140.00 | Apartment | 2.0 | 304 | 4 |
| 40703 | 4066224 | Brooklyn | NY | 11211 | Brooklyn | 40.712364 | -73.937896 | t | $200.00 | Apartment | 2.0 | 209 | 41 |
| 40706 | 15467579 | Brooklyn | NY | NaN | Brooklyn | 40.715428 | -73.955712 | f | $200.00 | Apartment | 2.0 | 0 | 0 |
| 40709 | 5506020 | Brooklyn | NY | 11211 | Brooklyn | 40.716312 | -73.941197 | t | $325.00 | Apartment | 2.0 | 176 | 27 |
| 40718 | 16515501 | Brooklyn | NY | 11206 | Brooklyn | 40.708927 | -73.946619 | t | $200.00 | Apartment | 2.0 | 0 | 0 |
| 40724 | 13896763 | Brooklyn | NY | 11211 | Brooklyn | 40.709601 | -73.954488 | t | $250.00 | Apartment | 2.0 | 344 | 1 |
| 40737 | 7471859 | Brooklyn | NY | 11211 | Brooklyn | 40.712802 | -73.941749 | t | $125.00 | Apartment | 2.0 | 0 | 0 |
| 40750 | 18324921 | New York | NY | 10004 | Manhattan | 40.726605 | -74.012903 | f | $600.00 | Boat | 2.0 | 309 | 0 |
| 40751 | 18035489 | New York | NY | 10029 | Manhattan | 40.793392 | -73.951867 | t | $129.00 | Apartment | 2.0 | 179 | 0 |
| 40752 | 17360085 | Fire Island | NY | 10001 | Manhattan | 40.728244 | -74.012611 | f | $200.00 | Boat | 2.0 | 308 | 0 |
# Reseting the index number
NY_listings_df.reset_index(inplace=True,drop=True)
NY_listings_df.shape# after filtering the dataset with two bedrooms we have 4894 rows and 13 columns(variables)
(4894, 13)
# changing the datatype of column 'id' from integer to string
NY_listings_df["id"]=NY_listings_df["id"].astype(str)
# removing the $, empty space,and "," from price column
NY_listings_df['price'] =NY_listings_df['price'].str.replace('\$','').replace(' ','')
NY_listings_df.head()
NY_listings_df['price'] = pd.to_numeric(NY_listings_df['price'].replace(',', '', regex=True))
The above operations are performed to change the datatype of coloumn 'id' from integers to string. Cleaned all the empty spaces, $ sign and punctuations from the column 'price' to convert it into float datatype
# to check if we have successfully cleaned our data
NY_listings_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4894 entries, 0 to 4893 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 4894 non-null object 1 city 4889 non-null object 2 state 4894 non-null object 3 zipcode 4832 non-null object 4 neighbourhood_group_cleansed 4894 non-null object 5 latitude 4894 non-null float64 6 longitude 4894 non-null float64 7 is_location_exact 4894 non-null object 8 price 4894 non-null float64 9 property_type 4894 non-null object 10 bedrooms 4894 non-null float64 11 availability_365 4894 non-null int64 12 number_of_reviews 4894 non-null int64 dtypes: float64(4), int64(2), object(7) memory usage: 497.2+ KB
The above result confirms that all my variables are assigned with correct datatypes
# calling missing_column_values function to compute the sum and percent of missing values of each column in the dataset
NY_listings_with_missingvalues = missing_column_values(NY_listings_df)
NY_listings_with_missingvalues
| index | Column_name | Missing_Value | Missing_Value_Percent | |
|---|---|---|---|---|
| 0 | id | id | 0 | 0.000000 |
| 1 | city | city | 5 | 0.102166 |
| 2 | state | state | 0 | 0.000000 |
| 3 | zipcode | zipcode | 62 | 1.266857 |
| 4 | neighbourhood_group_cleansed | neighbourhood_group_cleansed | 0 | 0.000000 |
| 5 | latitude | latitude | 0 | 0.000000 |
| 6 | longitude | longitude | 0 | 0.000000 |
| 7 | is_location_exact | is_location_exact | 0 | 0.000000 |
| 8 | price | price | 0 | 0.000000 |
| 9 | property_type | property_type | 0 | 0.000000 |
| 10 | bedrooms | bedrooms | 0 | 0.000000 |
| 11 | availability_365 | availability_365 | 0 | 0.000000 |
| 12 | number_of_reviews | number_of_reviews | 0 | 0.000000 |
The above results is showing the sum and percentages of missing values in each columns. Since, count and percentage of missing rows is very low as compared to number of data we have in our dataset. So, it is decided to drop all the null rows from the dataframe
# dropping all the columns with null value(s)
NY_listings_df.dropna(axis=0, inplace = True)
count_nan = NY_listings_df.isna().sum().sum()
print(count_nan)
# Calling missing_column_values function again to check if there are any columns with misssing values.
# Printing any columns that have missing values.
NY_listings_with_missingvalues = missing_column_values(zillow_ny_df)
NY_listings_with_missingvalues[NY_listings_with_missingvalues['Missing_Value_Percent'] > 0].count()
0
index 0 Column_name 0 Missing_Value 0 Missing_Value_Percent 0 dtype: int64
Successfully deleted all the rows with null values
NY_listings_df.shape
(4827, 13)
Our final NewYork listings dataset has 13 columns ans 4827 rows
# merging the two dataframe based on inner join.
#I am merging the Median House Income dataset for 2020 and merging it with the Airbnb dataset.
# The assumption is that the Airbnb dataset is for 2020.
final_df = NY_listings_df.merge(output_2020_df, how = 'inner', left_on = 'zipcode', right_on = 'RegionName')
# Renaming column MedianValue to Cost
final_df.rename(columns={"MedianValue": "Cost"}, inplace = True)
final_df.tail()
| id | city | state | zipcode | neighbourhood_group_cleansed | latitude | longitude | is_location_exact | price | property_type | bedrooms | availability_365 | number_of_reviews | RegionName | Year | Cost | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1118 | 10708999 | New York | NY | 10023 | Manhattan | 40.775973 | -73.987639 | t | 400.0 | Apartment | 2.0 | 38 | 10 | 10023 | 2020 | 2656181.63 |
| 1119 | 287492 | New York | NY | 10023 | Manhattan | 40.780634 | -73.980551 | t | 220.0 | Apartment | 2.0 | 0 | 6 | 10023 | 2020 | 2656181.63 |
| 1120 | 12273360 | New York | NY | 10023 | Manhattan | 40.781000 | -73.976975 | f | 129.0 | Apartment | 2.0 | 327 | 23 | 10023 | 2020 | 2656181.63 |
| 1121 | 9608639 | New York | NY | 10023 | Manhattan | 40.778844 | -73.978558 | t | 350.0 | Apartment | 2.0 | 206 | 65 | 10023 | 2020 | 2656181.63 |
| 1122 | 13113181 | New York | NY | 10023 | Manhattan | 40.779661 | -73.974597 | t | 500.0 | Townhouse | 2.0 | 62 | 15 | 10023 | 2020 | 2656181.63 |
data = [dict(
type = 'bar',
x = final_df['neighbourhood_group_cleansed'],
y = final_df['price'],
mode = 'markers',
color='time',
transforms = [dict(
type = 'aggregate',
groups = final_df['neighbourhood_group_cleansed'],
aggregations = [dict(
target = 'y', func = 'avg', enabled = True),
]
)]
)]
layout = dict(
title = 'Average Nightly Price of Properties per Neighbourhood',
xaxis = dict(title = 'Neighbourhood'),
yaxis = dict(title = 'AVG. Nightly Price'),
)
fig_dict = dict(data=data, layout=layout)
pio.show(fig_dict, validate=False)
The above bar graph shows that the average nightly price is the highest at Manhattan neighbourhood followed by Brooklyn, Staten Island, and Queens
Cost_Per_Neigbhourhood_df = final_df.groupby(["neighbourhood_group_cleansed"])['Cost'].mean().reset_index()
fig = px.bar(Cost_Per_Neigbhourhood_df, x="neighbourhood_group_cleansed", y="Cost",title="Cost Price Per Neighbourhood",color_continuous_scale=px.colors.sequential.Viridis)
fig.update_layout(
xaxis_title="Neighbourhood",
yaxis_title="Cost Price",
)
fig.show()
As per the cost price, Manhattan neighborhood is leading and followed by Brooklyn, Staten Island, and Queens. Although, Manhattan is the costliest neighborhood but revenue from rentals is also the highest. However, in order to arrive at which neighborhood is the most profitable one, we need to dig deeper to find out annual rate of return per neighborhood. The rental price in a neighborhood is corelated to the cost of that property in the neighborhood.
mean_price_perZipcode_df = final_df.groupby(['zipcode'])['price'].mean().reset_index()
fig = px.bar(mean_price_perZipcode_df, x="zipcode", y="price",title="Average Price Per ZipCode",color_continuous_scale=px.colors.sequential.Viridis)
fig.update_layout(
title="Average Nightly Price Per ZipCode",
xaxis_title="Zipcode",
yaxis_title="Average Nightly Price",
)
fig.show()
The above result shows which zip codes have the most and the least potential of earning daily rental revenue
avgPrice_perZipcodeByNeighbhourhood_df = final_df.groupby(['zipcode','neighbourhood_group_cleansed'])['price'].mean().reset_index()
fig = px.bar(avgPrice_perZipcodeByNeighbhourhood_df, x="zipcode", y="price", color = 'neighbourhood_group_cleansed',title="Average Price Per ZipCode by Neighbourhood",color_continuous_scale=px.colors.sequential.Viridis)
fig.update_layout(
xaxis_title="Zipcode",
yaxis_title="Average Price",
)
fig.show()
The above bar chart shows that out of 22 zip codes in the New York City dataset, Manhattan neighborhood has the greatest number of zip code and Queens has only one zip code. This clearly means that within Manhattan, there are a lot of popular zip codes where rental properties are in high demand. However, in Queen there is only a single zip code which is popular for rentals. For both Staten Island and Brooklyn, there are 5 zip codes each that are popular for rentals. The above figure is also clearly showing as to which zip codes by neighborhood has the most potential for rental income and has the highest nightly rental prices. For example, zip code 10011 in Manhattan seems to be the most expensive in terms of average nightly rental price. However, to determine profits, we will also need to consider the cost price of the properties in each zip code and then compare it with the average nightly rental price.
from plotly.graph_objs import *
import plotly.figure_factory as ff
fig = px.box(final_df, x="zipcode", y="price", title = 'Variability of Nightly Price per Zipcode')
fig.update_layout(
autosize=False,
width=1000,
height=1000,
margin=dict(
l=50,
r=50,
b=100,
t=100,
pad=4
)
)
fig.show()
The above boxplots shows that the zip code 10036 has the highest nightly price range with min: 58 and max: 4700. The next zip code with the highest nightly price range is 10003 (min: 60, max: 3750). However, both zip codes have outliers which is causing the range to be inflated. However, if we exclude zip codes with outliers, the zip codes with the highest variability are the following zip codes and I am listing them in descending order of variability: 10011, 10013, 10036 and 10128. Zip codes such as 10312, 10304, 10308 don't have any price variability. We need to dig further to find out in which neighborhood they are falling.
fig = px.box(final_df, x="zipcode", y="price", color = 'neighbourhood_group_cleansed', title = 'Variability of Nightly Price per Zipcode by Neighbourhood')
fig.update_layout(
autosize=False,
width=1000,
height=1000,
margin=dict(
l=50,
r=50,
b=100,
t=100,
pad=4
)
)
fig.show()
The above results made it clear that zip codes falling in Manhattan neighborhood has the most price variability which indicates that the highest fluctuation in rental price is in that neighborhood. Neighborhood such as 'Staten Island' has the least variability of price. High nightly price variability also indicates that potential for higher risk in Manhattan due to fluctuations in price. However, the fluctuation in price might also indicate seasonality with rental rates being higher in the summer as compared to winters due to variability in demand. Hence, we should keep in mind that the cash flow from rental properties in these two neighborhoods will fluctuate and should be factored into cash flow projections in the future.
fig = px.histogram(final_df, x="zipcode", color_discrete_sequence=['indianred'])
fig.update_layout(
title="Number of Properties by Zipcode",
xaxis_title="Zipcode",
yaxis_title="Number of Properties",
)
fig.show()
The above bar graph shows that the zipcode:11215 has the highest number of properties and zipcode:10312, 10304 has the least number of properties. We need to find out in which neighborhood these zip codes are located. But it is clear from this data that these are the most popular zip codes for rentals.
fig = px.histogram(final_df, x="zipcode", color = "neighbourhood_group_cleansed")
fig.update_layout(
title="Number of Properties per Zipcode by Neighbourhood",
xaxis_title="Zipcode",
yaxis_title="Number of Properties",
)
fig.show()
The above bar graph shows that the zip code 11215 which has the highest number of properties, is in Brooklyn neighborhood. The graph makes it clear that the neighborhoods that are the most popular in terms of short-term rentals are Brooklyn and Manhattan. The highest potential for investment in properties for short term rentals is in these 2 neighborhoods because they have the highest demand. We will need to check for the rate of return in these neighborhoods by zip code to determine the most profitable zip codes to invest in.
fig = px.histogram(final_df, x="property_type", color_discrete_sequence=['indianred'])
fig.update_layout(
title="Popular Property type",
xaxis_title="property_type",
yaxis_title="Number of Each Property types",
)
fig.show()
The above result indicate that apartment is the most popular property type by a long distance. We will next determine which neighborhoods have the most apartments.
plt.figure(figsize=(30,25))
sns.set(font_scale = 2)
sns.histplot(binwidth=0.5, x="neighbourhood_group_cleansed", hue="property_type", data=final_df, stat="count", multiple="stack")
plt.show()
It is clear from the above graph that Manhattan has the highest number of apartments followed by Brooklyn. Hence, Brooklyn and Manhattan are the most popular neighborhoods with the greatest number of apartments available for short-term rentals (which is the most popular property type for short term rentals). Hence, it is prudent to invest in apartments type properties in Manhattan Brooklyn.
ReviewsbyZipcode_df = final_df.groupby(["zipcode"])['number_of_reviews'].mean().reset_index()
fig = px.bar(ReviewsbyZipcode_df, x="zipcode", y="number_of_reviews",title="Average number Reviews Per ZipCode",color_continuous_scale=px.colors.sequential.Viridis)
fig.update_layout(
xaxis_title="Zipcode",
yaxis_title="Average number of Reviews",
)
fig.show()
The above result shows the zipcodes that are in top five in terms of popularity are: 10306, 10308, 10003, 10022,10036
# Calculation of Occupancy, Annual rent, gross rent multiplier, and annual return to be used as key metrics for evaluation
final_df['Occupancy'] = 0.75 * 365 # assumed occupancy to be 75% of the entire year
final_df['Annual_Rent'] = final_df['price'] * final_df['Occupancy']
final_df['Gross_Rent_Multiplier'] = final_df['Cost'] / final_df['Annual_Rent']
final_df['Annual_Return'] = ((final_df['Annual_Rent'] / final_df['Cost'])*100).apply(np.ceil)
final_df.head()
| id | city | state | zipcode | neighbourhood_group_cleansed | latitude | longitude | is_location_exact | price | property_type | bedrooms | availability_365 | number_of_reviews | RegionName | Year | Cost | Occupancy | Annual_Rent | Gross_Rent_Multiplier | Annual_Return | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 15910991 | Staten Island | NY | 10305 | Staten Island | 40.595265 | -74.064715 | t | 120.0 | House | 2.0 | 0 | 0 | 10305 | 2020 | 521088.19 | 273.75 | 32850.00 | 15.862654 | 7.0 |
| 1 | 17716523 | Staten Island | NY | 10305 | Staten Island | 40.593469 | -74.069137 | t | 110.0 | Apartment | 2.0 | 197 | 0 | 10305 | 2020 | 521088.19 | 273.75 | 30112.50 | 17.304714 | 6.0 |
| 2 | 16155769 | Staten Island | NY | 10305 | Staten Island | 40.596528 | -74.063712 | f | 150.0 | House | 2.0 | 0 | 0 | 10305 | 2020 | 521088.19 | 273.75 | 41062.50 | 12.690123 | 8.0 |
| 3 | 13370393 | Staten Island | NY | 10305 | Staten Island | 40.605556 | -74.082740 | t | 150.0 | Apartment | 2.0 | 83 | 0 | 10305 | 2020 | 521088.19 | 273.75 | 41062.50 | 12.690123 | 8.0 |
| 4 | 2611458 | Staten Island | NY | 10305 | Staten Island | 40.603750 | -74.080646 | t | 129.0 | House | 2.0 | 49 | 13 | 10305 | 2020 | 521088.19 | 273.75 | 35313.75 | 14.755957 | 7.0 |
AnnualRORbyZipcode_df = final_df.groupby(["zipcode"])['Annual_Return'].mean().reset_index()
fig = px.bar(AnnualRORbyZipcode_df, x="zipcode", y="Annual_Return",title=" Average Annual Return Per ZipCode",color_continuous_scale=px.colors.sequential.Viridis)
fig.update_layout(
xaxis_title="Zipcode",
yaxis_title="Average Annual rate of Return",
)
fig.show()
As per the above results, zipcode 10312 has the highest average annual rate of return with almost 14% followed by the zipcodes: 10304(9%), 11434(7%), 10036(6.2%).
GRMbyZipcode_df = final_df.groupby(["zipcode"])['Gross_Rent_Multiplier'].mean().reset_index()
fig = px.bar(GRMbyZipcode_df, x="zipcode", y="Gross_Rent_Multiplier",title=" Average Gross Rent Multiplier Per ZipCode",color_continuous_scale=px.colors.sequential.Viridis)
fig.update_layout(
xaxis_title="Zipcode",
yaxis_title="Average Gross Rent Multiplier",
)
fig.show()
The GRM calculation compares the property’s cost price or fair market value to the gross rental income. Using the gross rent multiplier is a good way to take a “quick look” at how fast the property will be paid off from the gross rent the property is generating. Lower the better. Based on this metric, zipcode 10312 is leading
To evaluate in which zip codes to invest, we will measure how each zip code performs against each of the following metrics:
In conclusion, my methodology resulted in zip codes that satisfy 3 of the metrics at most. Based on this analysis, the real estate company should prioritize investment in the following zip codes in NYC for short-term properties: